#!/usr/bin/Rscript

library(rms)
library(rms.gof)
library(ggplot2)
library(stargazer)

dfcan = read.csv('canada_final_dataset.csv',sep='\t')
dfusa = read.csv('usa_final_dataset.csv',sep='\t')
dfusa$dow = factor(dfusa$dow)
dfusa$hour = factor(dfusa$hour)
dfcan$dow = factor(dfcan$dow)
dfcan$hour = factor(dfcan$hour)

# Models for Canada (Table 2)

options(datadist=NULL)
d = datadist(dfcan)  
options(datadist="d")

model1 = balanced_class ~ gender 
model2 = balanced_class ~ gender + log(follower_count)
model3 = balanced_class ~ gender*log(follower_count) 
model4 = balanced_class ~ gender*log(follower_count) + visible_minority + party + federal + dow + hour 

f1 = lrm(model1, data=dfcan, x=T, y=T)
f1a = robcov(f1, cluster=dfcan$handle)
f2 = lrm(model2, data=dfcan, x=T, y=T)
f2a = robcov(f2, cluster=dfcan$handle)
f3 = lrm(model3, data=dfcan, x=T, y=T)
f3a = robcov(f3, cluster=dfcan$handle)
f4 = lrm(model4, data=dfcan, x=T, y=T)
f4a = robcov(f4, cluster=dfcan$handle)

stargazer(f1a,f2a,f3a,f4a,star.cutoffs = c(0.05,0.01,0.001))

# Models for USA (Table 3)

options(datadist=NULL)
usad = datadist(dfusa) 
options(datadist="usad")

usamodel1 = balanced_class ~ gender 
usamodel2 = balanced_class ~ gender + log(follower_count)
usamodel3 = balanced_class ~ gender*log(follower_count)
usamodel4 = balanced_class ~ gender*log(follower_count) + visible_minority + party + state + dow + hour 

usaf1 = lrm(usamodel1, data=dfusa, x=T, y=T)
usaf1a = robcov(usaf1, cluster=dfusa$handle)
usaf2 = lrm(usamodel2, data=dfusa, x=T, y=T)
usaf2a = robcov(usaf2, cluster=dfusa$handle)
usaf3 = lrm(usamodel3, data=dfusa, x=T, y=T)
usaf3a = robcov(usaf3, cluster=dfusa$handle)
usaf4 = lrm(usamodel4, data=dfusa, x=T, y=T)
usaf4a = robcov(usaf4, cluster=dfusa$handle)

stargazer(usaf1a,usaf2a,usaf3a,usaf4a,star.cutoffs = c(0.05,0.01,0.001))

# Predicted probability plot (Figure 2, Canada)

canmodel3 = balanced_class ~ gender*log(follower_count)
f3can = lrm(canmodel3, data=dfcan, x=T, y=T)
ncan = data.frame(gender=c(0,1,0,1), cats=c('P','P','U','U'), follower_count=c(50000,50000,500000,500000))
ncan$prob = predict(f3can, newdata=ncan)
ncan$response = exp(ncan$prob)/(1+exp(ncan$prob))

pdf("canpredprob.pdf", width=7, height=5, paper='special')
g = ggplot(ncan, aes(x=cats, y=response, fill=factor(gender), stat="identity")) + 
  geom_col(width=.4, position = position_dodge(width=0.5)) +
  ylab("Predicted Probability of Uncivil Tweet") +
  xlab("Visibility") +
  ylim(c(0,0.2)) +
  scale_x_discrete(labels = c('50,000 Followers','0.5M Followers')) +
  scale_fill_manual(values=c("darkblue", "darkmagenta"), name = "Gender", 
                    labels=c("Male", "Female")) 
g
dev.off()

# Predicted probability plot (Figure 2, USA)

usamodel3 = balanced_class ~ gender*log(follower_count)
f3 = lrm(usamodel3, data=dfusa, x=T, y=T)
nusa = data.frame(gender=c(0,1,0,1), cats=c('P','P','U','U'), follower_count=c(500000,500000,5000000,5000000))
nusa$prob = predict(f3, newdata=nusa)
nusa$response = exp(nusa$prob)/(1+exp(nusa$prob))

pdf("usapredprob.pdf", width=7, height=5, paper='special')
g = ggplot(nusa, aes(x=cats, y=response, fill=factor(gender), stat="identity")) + 
  geom_col(width=.4, position = position_dodge(width=0.5)) +
  ylab("Predicted Probability of Uncivil Tweet") +
  xlab("Visibility") +
  ylim(c(0,0.2)) +
  scale_x_discrete(labels = c('0.5M Followers','5M Followers')) +
  scale_fill_manual(values=c("darkblue", "darkmagenta"), name = "Gender", 
                    labels=c("Male", "Female")) 
g
dev.off()
